{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c48e25f0-ea40-4d2c-a973-6e9e0c5ee2ca",
   "metadata": {},
   "source": [
    "**** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running JupyterLab could be pre-configured with a requirements file that includes the right release. Example for transform developers working from a git clone:\n",
    "\n",
    "    make venv \n",
    "    source venv/bin/activate \n",
    "    pip install jupyterlab\n",
    "    venv/bin/jupyter lab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "61c27117-f8bb-4355-b9e3-21a67fa4a56d",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "## This is here as a reference only.\n",
    "# Users and application developers must install the release tag that matches the latest version on PyPI.\n",
    "%pip install 'data-prep-toolkit-transforms[bloom]'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9cae39c8-0f7b-4fc0-bea0-b707fb16bbfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import glob\n",
    "import pandas as pd\n",
    "from hashlib import sha256\n",
    "from pickle import dumps\n",
    "from huggingface_hub import list_repo_files, hf_hub_download\n",
    "from rbloom import Bloom\n",
    "from dpk_bloom.transform import BLOOMTransform\n",
    "from data_processing.data_access import DataAccessLocal"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9631e4ec-be99-4dc7-8007-e6a190a9420b",
   "metadata": {},
   "source": [
    "**** Specify the HuggingFace repo ID and the Bloom filter model\n",
    "\n",
    "- REPO_ID: Specifies the HuggingFace dataset repository ID. Defaults to 'HuggingFaceFW/fineweb'.\n",
    "- SNAPSHOT: Defines the snapshot path inside the repository, defaulting to data/CC-MAIN-2024-10. You may specify other available snapshots of FineWeb data.\n",
    "- BLOOM_MODEL: Specifies IBM's GneissWeb Bloom filter model, which is hosted on HuggingFace.\n",
    "- BATCH_SIZE: Adjust based on infrastructure capacity. The default value is 1000.\n",
    "- credentials: HuggingFace access token, read from the HF_READ_ACCESS_TOKEN environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "0a6b8341-4952-407b-ad63-342200583c06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration\n",
    "REPO_ID = \"HuggingFaceFW/fineweb\"\n",
    "SNAPSHOT = \"data/CC-MAIN-2024-10\"\n",
    "BLOOM_MODEL = \"ibm-granite/GneissWeb.bloom\"\n",
    "BATCH_SIZE = 1000\n",
    "# Read the HuggingFace token from the environment. When the variable is unset, fall back\n",
    "# to None rather than a placeholder string: a placeholder would be sent as a real token\n",
    "# with every request, whereas None lets huggingface_hub use a locally saved login or\n",
    "# anonymous access for public repositories.\n",
    "credentials = os.environ.get(\"HF_READ_ACCESS_TOKEN\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d3d3922-83bb-4715-9963-84c764f33a20",
   "metadata": {},
   "source": [
    "**** Fetch a specific Parquet file from a snapshot of Hugging Face's FineWeb dataset. idx corresponds to the {idx}-th Parquet file in the snapshot. Defaults to the first parquet file (idx=0)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0bee31af-2a9a-4e73-87a1-99eb0be86c62",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_parquet_path(repo_id, snapshot, idx=0):\n",
    "    \"\"\"Download one Parquet file of a dataset snapshot from the HuggingFace Hub.\n",
    "\n",
    "    Args:\n",
    "        repo_id: HuggingFace dataset repository ID.\n",
    "        snapshot: Path prefix of the snapshot inside the repository.\n",
    "        idx: Position of the file in the sorted list of the snapshot's Parquet files.\n",
    "            Negative values index from the end, as with a regular list.\n",
    "\n",
    "    Returns:\n",
    "        Local path of the downloaded file, as returned by hf_hub_download.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: if no Parquet file matches the snapshot prefix.\n",
    "        IndexError: if idx is outside the range of available files.\n",
    "    \"\"\"\n",
    "    # Use the same token for listing as for downloading, so gated or private\n",
    "    # repositories can be listed too.\n",
    "    files = sorted(\n",
    "        f for f in list_repo_files(repo_id, repo_type=\"dataset\", token=credentials)\n",
    "        if f.startswith(snapshot) and f.endswith(\".parquet\")\n",
    "    )\n",
    "\n",
    "    if not files:\n",
    "        raise FileNotFoundError(f\"No Parquet files found in snapshot: {snapshot}\")\n",
    "\n",
    "    print(f\"Snapshot {snapshot} contains {len(files)} Parquet files.\")\n",
    "\n",
    "    # Fail with an explanatory message instead of a bare \"list index out of range\".\n",
    "    if not -len(files) <= idx < len(files):\n",
    "        raise IndexError(\n",
    "            f\"idx={idx} is out of range: snapshot {snapshot} has {len(files)} Parquet files.\"\n",
    "        )\n",
    "\n",
    "    file_path = hf_hub_download(repo_id=repo_id, filename=files[idx], repo_type=\"dataset\", token=credentials)\n",
    "    print(f\"Downloaded {idx}th Parquet file: {file_path}\")\n",
    "    return file_path"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58284f42-e7b9-46c3-a65d-54299a943d95",
   "metadata": {},
   "source": [
    "**** input_folder is the path of the {idx}-th Parquet file in the snapshot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "68e0d116-5065-4540-a2cf-7ed41fe1d8c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Snapshot data/CC-MAIN-2024-10 contains 300 Parquet files.\n",
      "Downloaded 0th Parquet file: /Users/ian/.cache/huggingface/hub/datasets--HuggingFaceFW--fineweb/snapshots/0f039043b23fe1d4eed300b504aa4b4a68f1c7ba/data/CC-MAIN-2024-10/000_00000.parquet\n",
      "ibm-granite/GneissWeb.bloom is dnowloaded and cashed here: /Users/ian/.cache/huggingface/hub/models--ibm-granite--GneissWeb.bloom/snapshots/a2db4b89e557d1865f5b5c968768b9fbdf371d18/gneissweb.bloom\n"
     ]
    }
   ],
   "source": [
    "# Setup paths\n",
    "# NOTE: despite its name, input_folder holds the path of the single downloaded Parquet file.\n",
    "input_folder = load_parquet_path(repo_id=REPO_ID, snapshot=SNAPSHOT, idx=0)\n",
    "# Results go to an \"output\" directory located next to the current working directory.\n",
    "output_folder = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), \"output\"))\n",
    "\n",
    "model_path = hf_hub_download(repo_id=BLOOM_MODEL, filename=\"gneissweb.bloom\", token=credentials)\n",
    "print(f\"{BLOOM_MODEL} is downloaded and cached here: {model_path}\")\n",
    "\n",
    "# Initialize local data access\n",
    "data_access = DataAccessLocal({\"input_folder\": input_folder, \"output_folder\": output_folder})\n",
    "\n",
    "# Load table\n",
    "table, _ = data_access.get_table(input_folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5261ecf-a885-4ff1-b180-fc69cb2e21c5",
   "metadata": {},
   "source": [
    "**** Initialize the Bloom transform class. BLOOM_MODEL defaults to \"ibm-granite/GneissWeb.bloom\", which is 28GB in size and may take several minutes to download. **Once downloaded, it is cached and will be reused the next time BLOOMTransform is called**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b8a1ea5e-5da2-4b41-88d5-09a117da412c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure and instantiate the BLOOM transform\n",
    "# The settings are collected under a name first so they can be inspected before use.\n",
    "bloom_config = {\n",
    "    \"model_name_or_path\": model_path,\n",
    "    \"annotation_column\": \"is_in_GneissWeb\",\n",
    "    \"doc_text_column\": \"contents\",\n",
    "    \"inference_engine\": \"CPU\",\n",
    "    \"batch_size\": BATCH_SIZE,\n",
    "    \"data_access\": data_access,\n",
    "}\n",
    "transform = BLOOMTransform(bloom_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ee1b0b56-442a-4928-a0d0-acbbefac1afa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing batch: 0/973\n",
      "Processing batch: 1/973\n",
      "Processing batch: 2/973\n",
      "Processing batch: 3/973\n",
      "Processing batch: 4/973\n",
      "Processing batch: 5/973\n",
      "Processing batch: 6/973\n",
      "Processing batch: 7/973\n",
      "Processing batch: 8/973\n",
      "Processing batch: 9/973\n",
      "Processing batch: 10/973\n",
      "Processing batch: 11/973\n",
      "Processing batch: 12/973\n",
      "Processing batch: 13/973\n",
      "Processing batch: 14/973\n",
      "Processing batch: 15/973\n",
      "Processing batch: 16/973\n",
      "Processing batch: 17/973\n",
      "Processing batch: 18/973\n",
      "Processing batch: 19/973\n",
      "Processing batch: 20/973\n",
      "Processing batch: 21/973\n",
      "Processing batch: 22/973\n",
      "Processing batch: 23/973\n",
      "Processing batch: 24/973\n",
      "Processing batch: 25/973\n",
      "Processing batch: 26/973\n",
      "Processing batch: 27/973\n",
      "Processing batch: 28/973\n",
      "Processing batch: 29/973\n",
      "Processing batch: 30/973\n",
      "Processing batch: 31/973\n",
      "Processing batch: 32/973\n",
      "Processing batch: 33/973\n",
      "Processing batch: 34/973\n",
      "Processing batch: 35/973\n",
      "Processing batch: 36/973\n",
      "Processing batch: 37/973\n",
      "Processing batch: 38/973\n",
      "Processing batch: 39/973\n",
      "Processing batch: 40/973\n",
      "Processing batch: 41/973\n",
      "Processing batch: 42/973\n",
      "Processing batch: 43/973\n",
      "Processing batch: 44/973\n",
      "Processing batch: 45/973\n",
      "Processing batch: 46/973\n",
      "Processing batch: 47/973\n",
      "Processing batch: 48/973\n",
      "Processing batch: 49/973\n",
      "Processing batch: 50/973\n",
      "Processing batch: 51/973\n",
      "Processing batch: 52/973\n",
      "Processing batch: 53/973\n",
      "Processing batch: 54/973\n",
      "Processing batch: 55/973\n",
      "Processing batch: 56/973\n",
      "Processing batch: 57/973\n",
      "Processing batch: 58/973\n",
      "Processing batch: 59/973\n",
      "Processing batch: 60/973\n",
      "Processing batch: 61/973\n",
      "Processing batch: 62/973\n",
      "Processing batch: 63/973\n",
      "Processing batch: 64/973\n",
      "Processing batch: 65/973\n",
      "Processing batch: 66/973\n",
      "Processing batch: 67/973\n",
      "Processing batch: 68/973\n",
      "Processing batch: 69/973\n",
      "Processing batch: 70/973\n",
      "Processing batch: 71/973\n",
      "Processing batch: 72/973\n",
      "Processing batch: 73/973\n",
      "Processing batch: 74/973\n",
      "Processing batch: 75/973\n",
      "Processing batch: 76/973\n",
      "Processing batch: 77/973\n",
      "Processing batch: 78/973\n",
      "Processing batch: 79/973\n",
      "Processing batch: 80/973\n",
      "Processing batch: 81/973\n",
      "Processing batch: 82/973\n",
      "Processing batch: 83/973\n",
      "Processing batch: 84/973\n",
      "Processing batch: 85/973\n",
      "Processing batch: 86/973\n",
      "Processing batch: 87/973\n",
      "Processing batch: 88/973\n",
      "Processing batch: 89/973\n",
      "Processing batch: 90/973\n",
      "Processing batch: 91/973\n",
      "Processing batch: 92/973\n",
      "Processing batch: 93/973\n",
      "Processing batch: 94/973\n",
      "Processing batch: 95/973\n",
      "Processing batch: 96/973\n",
      "Processing batch: 97/973\n",
      "Processing batch: 98/973\n",
      "Processing batch: 99/973\n",
      "Processing batch: 100/973\n",
      "Processing batch: 101/973\n",
      "Processing batch: 102/973\n",
      "Processing batch: 103/973\n",
      "Processing batch: 104/973\n",
      "Processing batch: 105/973\n",
      "Processing batch: 106/973\n",
      "Processing batch: 107/973\n",
      "Processing batch: 108/973\n",
      "Processing batch: 109/973\n",
      "Processing batch: 110/973\n",
      "Processing batch: 111/973\n",
      "Processing batch: 112/973\n",
      "Processing batch: 113/973\n",
      "Processing batch: 114/973\n",
      "Processing batch: 115/973\n",
      "Processing batch: 116/973\n",
      "Processing batch: 117/973\n",
      "Processing batch: 118/973\n",
      "Processing batch: 119/973\n",
      "Processing batch: 120/973\n",
      "Processing batch: 121/973\n",
      "Processing batch: 122/973\n",
      "Processing batch: 123/973\n",
      "Processing batch: 124/973\n",
      "Processing batch: 125/973\n",
      "Processing batch: 126/973\n",
      "Processing batch: 127/973\n",
      "Processing batch: 128/973\n",
      "Processing batch: 129/973\n",
      "Processing batch: 130/973\n",
      "Processing batch: 131/973\n",
      "Processing batch: 132/973\n",
      "Processing batch: 133/973\n",
      "Processing batch: 134/973\n",
      "Processing batch: 135/973\n",
      "Processing batch: 136/973\n",
      "Processing batch: 137/973\n",
      "Processing batch: 138/973\n",
      "Processing batch: 139/973\n",
      "Processing batch: 140/973\n",
      "Processing batch: 141/973\n",
      "Processing batch: 142/973\n",
      "Processing batch: 143/973\n",
      "Processing batch: 144/973\n",
      "Processing batch: 145/973\n",
      "Processing batch: 146/973\n",
      "Processing batch: 147/973\n",
      "Processing batch: 148/973\n",
      "Processing batch: 149/973\n",
      "Processing batch: 150/973\n",
      "Processing batch: 151/973\n",
      "Processing batch: 152/973\n",
      "Processing batch: 153/973\n",
      "Processing batch: 154/973\n",
      "Processing batch: 155/973\n",
      "Processing batch: 156/973\n",
      "Processing batch: 157/973\n",
      "Processing batch: 158/973\n",
      "Processing batch: 159/973\n",
      "Processing batch: 160/973\n",
      "Processing batch: 161/973\n",
      "Processing batch: 162/973\n",
      "Processing batch: 163/973\n",
      "Processing batch: 164/973\n",
      "Processing batch: 165/973\n",
      "Processing batch: 166/973\n",
      "Processing batch: 167/973\n",
      "Processing batch: 168/973\n",
      "Processing batch: 169/973\n",
      "Processing batch: 170/973\n",
      "Processing batch: 171/973\n",
      "Processing batch: 172/973\n",
      "Processing batch: 173/973\n",
      "Processing batch: 174/973\n",
      "Processing batch: 175/973\n",
      "Processing batch: 176/973\n",
      "Processing batch: 177/973\n",
      "Processing batch: 178/973\n",
      "Processing batch: 179/973\n",
      "Processing batch: 180/973\n",
      "Processing batch: 181/973\n",
      "Processing batch: 182/973\n",
      "Processing batch: 183/973\n",
      "Processing batch: 184/973\n",
      "Processing batch: 185/973\n",
      "Processing batch: 186/973\n",
      "Processing batch: 187/973\n",
      "Processing batch: 188/973\n",
      "Processing batch: 189/973\n",
      "Processing batch: 190/973\n",
      "Processing batch: 191/973\n",
      "Processing batch: 192/973\n",
      "Processing batch: 193/973\n",
      "Processing batch: 194/973\n",
      "Processing batch: 195/973\n",
      "Processing batch: 196/973\n",
      "Processing batch: 197/973\n",
      "Processing batch: 198/973\n",
      "Processing batch: 199/973\n",
      "Processing batch: 200/973\n",
      "Processing batch: 201/973\n",
      "Processing batch: 202/973\n",
      "Processing batch: 203/973\n",
      "Processing batch: 204/973\n",
      "Processing batch: 205/973\n",
      "Processing batch: 206/973\n",
      "Processing batch: 207/973\n",
      "Processing batch: 208/973\n",
      "Processing batch: 209/973\n",
      "Processing batch: 210/973\n",
      "Processing batch: 211/973\n",
      "Processing batch: 212/973\n",
      "Processing batch: 213/973\n",
      "Processing batch: 214/973\n",
      "Processing batch: 215/973\n",
      "Processing batch: 216/973\n",
      "Processing batch: 217/973\n",
      "Processing batch: 218/973\n",
      "Processing batch: 219/973\n",
      "Processing batch: 220/973\n",
      "Processing batch: 221/973\n",
      "Processing batch: 222/973\n",
      "Processing batch: 223/973\n",
      "Processing batch: 224/973\n",
      "Processing batch: 225/973\n",
      "Processing batch: 226/973\n",
      "Processing batch: 227/973\n",
      "Processing batch: 228/973\n",
      "Processing batch: 229/973\n",
      "Processing batch: 230/973\n",
      "Processing batch: 231/973\n",
      "Processing batch: 232/973\n",
      "Processing batch: 233/973\n",
      "Processing batch: 234/973\n",
      "Processing batch: 235/973\n",
      "Processing batch: 236/973\n",
      "Processing batch: 237/973\n",
      "Processing batch: 238/973\n",
      "Processing batch: 239/973\n",
      "Processing batch: 240/973\n",
      "Processing batch: 241/973\n",
      "Processing batch: 242/973\n",
      "Processing batch: 243/973\n",
      "Processing batch: 244/973\n",
      "Processing batch: 245/973\n",
      "Processing batch: 246/973\n",
      "Processing batch: 247/973\n",
      "Processing batch: 248/973\n",
      "Processing batch: 249/973\n",
      "Processing batch: 250/973\n",
      "Processing batch: 251/973\n",
      "Processing batch: 252/973\n",
      "Processing batch: 253/973\n",
      "Processing batch: 254/973\n",
      "Processing batch: 255/973\n",
      "Processing batch: 256/973\n",
      "Processing batch: 257/973\n",
      "Processing batch: 258/973\n",
      "Processing batch: 259/973\n",
      "Processing batch: 260/973\n",
      "Processing batch: 261/973\n",
      "Processing batch: 262/973\n",
      "Processing batch: 263/973\n",
      "Processing batch: 264/973\n",
      "Processing batch: 265/973\n",
      "Processing batch: 266/973\n",
      "Processing batch: 267/973\n",
      "Processing batch: 268/973\n",
      "Processing batch: 269/973\n",
      "Processing batch: 270/973\n",
      "Processing batch: 271/973\n",
      "Processing batch: 272/973\n",
      "Processing batch: 273/973\n",
      "Processing batch: 274/973\n",
      "Processing batch: 275/973\n",
      "Processing batch: 276/973\n",
      "Processing batch: 277/973\n",
      "Processing batch: 278/973\n",
      "Processing batch: 279/973\n",
      "Processing batch: 280/973\n",
      "Processing batch: 281/973\n",
      "Processing batch: 282/973\n",
      "Processing batch: 283/973\n",
      "Processing batch: 284/973\n",
      "Processing batch: 285/973\n",
      "Processing batch: 286/973\n",
      "Processing batch: 287/973\n",
      "Processing batch: 288/973\n",
      "Processing batch: 289/973\n",
      "Processing batch: 290/973\n",
      "Processing batch: 291/973\n",
      "Processing batch: 292/973\n",
      "Processing batch: 293/973\n",
      "Processing batch: 294/973\n",
      "Processing batch: 295/973\n",
      "Processing batch: 296/973\n",
      "Processing batch: 297/973\n",
      "Processing batch: 298/973\n",
      "Processing batch: 299/973\n",
      "Processing batch: 300/973\n",
      "Processing batch: 301/973\n",
      "Processing batch: 302/973\n",
      "Processing batch: 303/973\n",
      "Processing batch: 304/973\n",
      "Processing batch: 305/973\n",
      "Processing batch: 306/973\n",
      "Processing batch: 307/973\n",
      "Processing batch: 308/973\n",
      "Processing batch: 309/973\n",
      "Processing batch: 310/973\n",
      "Processing batch: 311/973\n",
      "Processing batch: 312/973\n",
      "Processing batch: 313/973\n",
      "Processing batch: 314/973\n",
      "Processing batch: 315/973\n",
      "Processing batch: 316/973\n",
      "Processing batch: 317/973\n",
      "Processing batch: 318/973\n",
      "Processing batch: 319/973\n",
      "Processing batch: 320/973\n",
      "Processing batch: 321/973\n",
      "Processing batch: 322/973\n",
      "Processing batch: 323/973\n",
      "Processing batch: 324/973\n",
      "Processing batch: 325/973\n",
      "Processing batch: 326/973\n",
      "Processing batch: 327/973\n",
      "Processing batch: 328/973\n",
      "Processing batch: 329/973\n",
      "Processing batch: 330/973\n",
      "Processing batch: 331/973\n",
      "Processing batch: 332/973\n",
      "Processing batch: 333/973\n",
      "Processing batch: 334/973\n",
      "Processing batch: 335/973\n",
      "Processing batch: 336/973\n",
      "Processing batch: 337/973\n",
      "Processing batch: 338/973\n",
      "Processing batch: 339/973\n",
      "Processing batch: 340/973\n",
      "Processing batch: 341/973\n",
      "Processing batch: 342/973\n",
      "Processing batch: 343/973\n",
      "Processing batch: 344/973\n",
      "Processing batch: 345/973\n",
      "Processing batch: 346/973\n",
      "Processing batch: 347/973\n",
      "Processing batch: 348/973\n",
      "Processing batch: 349/973\n",
      "Processing batch: 350/973\n",
      "Processing batch: 351/973\n",
      "Processing batch: 352/973\n",
      "Processing batch: 353/973\n",
      "Processing batch: 354/973\n",
      "Processing batch: 355/973\n",
      "Processing batch: 356/973\n",
      "Processing batch: 357/973\n",
      "Processing batch: 358/973\n",
      "Processing batch: 359/973\n",
      "Processing batch: 360/973\n",
      "Processing batch: 361/973\n",
      "Processing batch: 362/973\n",
      "Processing batch: 363/973\n",
      "Processing batch: 364/973\n",
      "Processing batch: 365/973\n",
      "Processing batch: 366/973\n",
      "Processing batch: 367/973\n",
      "Processing batch: 368/973\n",
      "Processing batch: 369/973\n",
      "Processing batch: 370/973\n",
      "Processing batch: 371/973\n",
      "Processing batch: 372/973\n",
      "Processing batch: 373/973\n",
      "Processing batch: 374/973\n",
      "Processing batch: 375/973\n",
      "Processing batch: 376/973\n",
      "Processing batch: 377/973\n",
      "Processing batch: 378/973\n",
      "Processing batch: 379/973\n",
      "Processing batch: 380/973\n",
      "Processing batch: 381/973\n",
      "Processing batch: 382/973\n",
      "Processing batch: 383/973\n",
      "Processing batch: 384/973\n",
      "Processing batch: 385/973\n",
      "Processing batch: 386/973\n",
      "Processing batch: 387/973\n",
      "Processing batch: 388/973\n",
      "Processing batch: 389/973\n",
      "Processing batch: 390/973\n",
      "Processing batch: 391/973\n",
      "Processing batch: 392/973\n",
      "Processing batch: 393/973\n",
      "Processing batch: 394/973\n",
      "Processing batch: 395/973\n",
      "Processing batch: 396/973\n",
      "Processing batch: 397/973\n",
      "Processing batch: 398/973\n",
      "Processing batch: 399/973\n",
      "Processing batch: 400/973\n",
      "Processing batch: 401/973\n",
      "Processing batch: 402/973\n",
      "Processing batch: 403/973\n",
      "Processing batch: 404/973\n",
      "Processing batch: 405/973\n",
      "Processing batch: 406/973\n",
      "Processing batch: 407/973\n",
      "Processing batch: 408/973\n",
      "Processing batch: 409/973\n",
      "Processing batch: 410/973\n",
      "Processing batch: 411/973\n",
      "Processing batch: 412/973\n",
      "Processing batch: 413/973\n",
      "Processing batch: 414/973\n",
      "Processing batch: 415/973\n",
      "Processing batch: 416/973\n",
      "Processing batch: 417/973\n",
      "Processing batch: 418/973\n",
      "Processing batch: 419/973\n",
      "Processing batch: 420/973\n",
      "Processing batch: 421/973\n",
      "Processing batch: 422/973\n",
      "Processing batch: 423/973\n",
      "Processing batch: 424/973\n",
      "Processing batch: 425/973\n",
      "Processing batch: 426/973\n",
      "Processing batch: 427/973\n",
      "Processing batch: 428/973\n",
      "Processing batch: 429/973\n",
      "Processing batch: 430/973\n",
      "Processing batch: 431/973\n",
      "Processing batch: 432/973\n",
      "Processing batch: 433/973\n",
      "Processing batch: 434/973\n",
      "Processing batch: 435/973\n",
      "Processing batch: 436/973\n",
      "Processing batch: 437/973\n",
      "Processing batch: 438/973\n",
      "Processing batch: 439/973\n",
      "Processing batch: 440/973\n",
      "Processing batch: 441/973\n",
      "Processing batch: 442/973\n",
      "Processing batch: 443/973\n",
      "Processing batch: 444/973\n",
      "Processing batch: 445/973\n",
      "Processing batch: 446/973\n",
      "Processing batch: 447/973\n",
      "Processing batch: 448/973\n",
      "Processing batch: 449/973\n",
      "Processing batch: 450/973\n",
      "Processing batch: 451/973\n",
      "Processing batch: 452/973\n",
      "Processing batch: 453/973\n",
      "Processing batch: 454/973\n",
      "Processing batch: 455/973\n",
      "Processing batch: 456/973\n",
      "Processing batch: 457/973\n",
      "Processing batch: 458/973\n",
      "Processing batch: 459/973\n",
      "Processing batch: 460/973\n",
      "Processing batch: 461/973\n",
      "Processing batch: 462/973\n",
      "Processing batch: 463/973\n",
      "Processing batch: 464/973\n",
      "Processing batch: 465/973\n",
      "Processing batch: 466/973\n",
      "Processing batch: 467/973\n",
      "Processing batch: 468/973\n",
      "Processing batch: 469/973\n",
      "Processing batch: 470/973\n",
      "Processing batch: 471/973\n",
      "Processing batch: 472/973\n",
      "Processing batch: 473/973\n",
      "Processing batch: 474/973\n",
      "Processing batch: 475/973\n",
      "Processing batch: 476/973\n",
      "Processing batch: 477/973\n",
      "Processing batch: 478/973\n",
      "Processing batch: 479/973\n",
      "Processing batch: 480/973\n",
      "Processing batch: 481/973\n",
      "Processing batch: 482/973\n",
      "Processing batch: 483/973\n",
      "Processing batch: 484/973\n",
      "Processing batch: 485/973\n",
      "Processing batch: 486/973\n",
      "Processing batch: 487/973\n",
      "Processing batch: 488/973\n",
      "Processing batch: 489/973\n",
      "Processing batch: 490/973\n",
      "Processing batch: 491/973\n",
      "Processing batch: 492/973\n",
      "Processing batch: 493/973\n",
      "Processing batch: 494/973\n",
      "Processing batch: 495/973\n",
      "Processing batch: 496/973\n",
      "Processing batch: 497/973\n",
      "Processing batch: 498/973\n",
      "Processing batch: 499/973\n",
      "Processing batch: 500/973\n",
      "Processing batch: 501/973\n",
      "Processing batch: 502/973\n",
      "Processing batch: 503/973\n",
      "Processing batch: 504/973\n",
      "Processing batch: 505/973\n",
      "Processing batch: 506/973\n",
      "Processing batch: 507/973\n",
      "Processing batch: 508/973\n",
      "Processing batch: 509/973\n",
      "Processing batch: 510/973\n",
      "Processing batch: 511/973\n",
      "Processing batch: 512/973\n",
      "Processing batch: 513/973\n",
      "Processing batch: 514/973\n",
      "Processing batch: 515/973\n",
      "Processing batch: 516/973\n",
      "Processing batch: 517/973\n",
      "Processing batch: 518/973\n",
      "Processing batch: 519/973\n",
      "Processing batch: 520/973\n",
      "Processing batch: 521/973\n",
      "Processing batch: 522/973\n",
      "Processing batch: 523/973\n",
      "Processing batch: 524/973\n",
      "Processing batch: 525/973\n",
      "Processing batch: 526/973\n",
      "Processing batch: 527/973\n",
      "Processing batch: 528/973\n",
      "Processing batch: 529/973\n",
      "Processing batch: 530/973\n",
      "Processing batch: 531/973\n",
      "Processing batch: 532/973\n",
      "Processing batch: 533/973\n",
      "Processing batch: 534/973\n",
      "Processing batch: 535/973\n",
      "Processing batch: 536/973\n",
      "Processing batch: 537/973\n",
      "Processing batch: 538/973\n",
      "Processing batch: 539/973\n",
      "Processing batch: 540/973\n",
      "Processing batch: 541/973\n",
      "Processing batch: 542/973\n",
      "Processing batch: 543/973\n",
      "Processing batch: 544/973\n",
      "Processing batch: 545/973\n",
      "Processing batch: 546/973\n",
      "Processing batch: 547/973\n",
      "Processing batch: 548/973\n",
      "Processing batch: 549/973\n",
      "Processing batch: 550/973\n",
      "Processing batch: 551/973\n",
      "Processing batch: 552/973\n",
      "Processing batch: 553/973\n",
      "Processing batch: 554/973\n",
      "Processing batch: 555/973\n",
      "Processing batch: 556/973\n",
      "Processing batch: 557/973\n",
      "Processing batch: 558/973\n",
      "Processing batch: 559/973\n",
      "Processing batch: 560/973\n",
      "Processing batch: 561/973\n",
      "Processing batch: 562/973\n",
      "Processing batch: 563/973\n",
      "Processing batch: 564/973\n",
      "Processing batch: 565/973\n",
      "Processing batch: 566/973\n",
      "Processing batch: 567/973\n",
      "Processing batch: 568/973\n",
      "Processing batch: 569/973\n",
      "Processing batch: 570/973\n",
      "Processing batch: 571/973\n",
      "Processing batch: 572/973\n",
      "Processing batch: 573/973\n",
      "Processing batch: 574/973\n",
      "Processing batch: 575/973\n",
      "Processing batch: 576/973\n",
      "Processing batch: 577/973\n",
      "Processing batch: 578/973\n",
      "Processing batch: 579/973\n",
      "Processing batch: 580/973\n",
      "Processing batch: 581/973\n",
      "Processing batch: 582/973\n",
      "Processing batch: 583/973\n",
      "Processing batch: 584/973\n",
      "Processing batch: 585/973\n",
      "Processing batch: 586/973\n",
      "Processing batch: 587/973\n",
      "Processing batch: 588/973\n",
      "Processing batch: 589/973\n",
      "Processing batch: 590/973\n",
      "Processing batch: 591/973\n",
      "Processing batch: 592/973\n",
      "Processing batch: 593/973\n",
      "Processing batch: 594/973\n",
      "Processing batch: 595/973\n",
      "Processing batch: 596/973\n",
      "Processing batch: 597/973\n",
      "Processing batch: 598/973\n",
      "Processing batch: 599/973\n",
      "Processing batch: 600/973\n",
      "Processing batch: 601/973\n",
      "Processing batch: 602/973\n",
      "Processing batch: 603/973\n",
      "Processing batch: 604/973\n",
      "Processing batch: 605/973\n",
      "Processing batch: 606/973\n",
      "Processing batch: 607/973\n",
      "Processing batch: 608/973\n",
      "Processing batch: 609/973\n",
      "Processing batch: 610/973\n",
      "Processing batch: 611/973\n",
      "Processing batch: 612/973\n",
      "Processing batch: 613/973\n",
      "Processing batch: 614/973\n",
      "Processing batch: 615/973\n",
      "Processing batch: 616/973\n",
      "Processing batch: 617/973\n",
      "Processing batch: 618/973\n",
      "Processing batch: 619/973\n",
      "Processing batch: 620/973\n",
      "Processing batch: 621/973\n",
      "Processing batch: 622/973\n",
      "Processing batch: 623/973\n",
      "Processing batch: 624/973\n",
      "Processing batch: 625/973\n",
      "Processing batch: 626/973\n",
      "Processing batch: 627/973\n",
      "Processing batch: 628/973\n",
      "Processing batch: 629/973\n",
      "Processing batch: 630/973\n",
      "Processing batch: 631/973\n",
      "Processing batch: 632/973\n",
      "Processing batch: 633/973\n",
      "Processing batch: 634/973\n",
      "Processing batch: 635/973\n",
      "Processing batch: 636/973\n",
      "Processing batch: 637/973\n",
      "Processing batch: 638/973\n",
      "Processing batch: 639/973\n",
      "Processing batch: 640/973\n",
      "Processing batch: 641/973\n",
      "Processing batch: 642/973\n",
      "Processing batch: 643/973\n",
      "Processing batch: 644/973\n",
      "Processing batch: 645/973\n",
      "Processing batch: 646/973\n",
      "Processing batch: 647/973\n",
      "Processing batch: 648/973\n",
      "Processing batch: 649/973\n",
      "Processing batch: 650/973\n",
      "Processing batch: 651/973\n",
      "Processing batch: 652/973\n",
      "Processing batch: 653/973\n",
      "Processing batch: 654/973\n",
      "Processing batch: 655/973\n",
      "Processing batch: 656/973\n",
      "Processing batch: 657/973\n",
      "Processing batch: 658/973\n",
      "Processing batch: 659/973\n",
      "Processing batch: 660/973\n",
      "Processing batch: 661/973\n",
      "Processing batch: 662/973\n",
      "Processing batch: 663/973\n",
      "Processing batch: 664/973\n",
      "Processing batch: 665/973\n",
      "Processing batch: 666/973\n",
      "Processing batch: 667/973\n",
      "Processing batch: 668/973\n",
      "Processing batch: 669/973\n",
      "Processing batch: 670/973\n",
      "Processing batch: 671/973\n",
      "Processing batch: 672/973\n",
      "Processing batch: 673/973\n",
      "Processing batch: 674/973\n",
      "Processing batch: 675/973\n",
      "Processing batch: 676/973\n",
      "Processing batch: 677/973\n",
      "Processing batch: 678/973\n",
      "Processing batch: 679/973\n",
      "Processing batch: 680/973\n",
      "Processing batch: 681/973\n",
      "Processing batch: 682/973\n",
      "Processing batch: 683/973\n",
      "Processing batch: 684/973\n",
      "Processing batch: 685/973\n",
      "Processing batch: 686/973\n",
      "Processing batch: 687/973\n",
      "Processing batch: 688/973\n",
      "Processing batch: 689/973\n",
      "Processing batch: 690/973\n",
      "Processing batch: 691/973\n",
      "Processing batch: 692/973\n",
      "Processing batch: 693/973\n",
      "Processing batch: 694/973\n",
      "Processing batch: 695/973\n",
      "Processing batch: 696/973\n",
      "Processing batch: 697/973\n",
      "Processing batch: 698/973\n",
      "Processing batch: 699/973\n",
      "Processing batch: 700/973\n",
      "Processing batch: 701/973\n",
      "Processing batch: 702/973\n",
      "Processing batch: 703/973\n",
      "Processing batch: 704/973\n",
      "Processing batch: 705/973\n",
      "Processing batch: 706/973\n",
      "Processing batch: 707/973\n",
      "Processing batch: 708/973\n",
      "Processing batch: 709/973\n",
      "Processing batch: 710/973\n",
      "Processing batch: 711/973\n",
      "Processing batch: 712/973\n",
      "Processing batch: 713/973\n",
      "Processing batch: 714/973\n",
      "Processing batch: 715/973\n",
      "Processing batch: 716/973\n",
      "Processing batch: 717/973\n",
      "Processing batch: 718/973\n",
      "Processing batch: 719/973\n",
      "Processing batch: 720/973\n",
      "Processing batch: 721/973\n",
      "Processing batch: 722/973\n",
      "Processing batch: 723/973\n",
      "Processing batch: 724/973\n",
      "Processing batch: 725/973\n",
      "Processing batch: 726/973\n",
      "Processing batch: 727/973\n",
      "Processing batch: 728/973\n",
      "Processing batch: 729/973\n",
      "Processing batch: 730/973\n",
      "Processing batch: 731/973\n",
      "Processing batch: 732/973\n",
      "Processing batch: 733/973\n",
      "Processing batch: 734/973\n",
      "Processing batch: 735/973\n",
      "Processing batch: 736/973\n",
      "Processing batch: 737/973\n",
      "Processing batch: 738/973\n",
      "Processing batch: 739/973\n",
      "Processing batch: 740/973\n",
      "Processing batch: 741/973\n",
      "Processing batch: 742/973\n",
      "Processing batch: 743/973\n",
      "Processing batch: 744/973\n",
      "Processing batch: 745/973\n",
      "Processing batch: 746/973\n",
      "Processing batch: 747/973\n",
      "Processing batch: 748/973\n",
      "Processing batch: 749/973\n",
      "Processing batch: 750/973\n",
      "Processing batch: 751/973\n",
      "Processing batch: 752/973\n",
      "Processing batch: 753/973\n",
      "Processing batch: 754/973\n",
      "Processing batch: 755/973\n",
      "Processing batch: 756/973\n",
      "Processing batch: 757/973\n",
      "Processing batch: 758/973\n",
      "Processing batch: 759/973\n",
      "Processing batch: 760/973\n",
      "Processing batch: 761/973\n",
      "Processing batch: 762/973\n",
      "Processing batch: 763/973\n",
      "Processing batch: 764/973\n",
      "Processing batch: 765/973\n",
      "Processing batch: 766/973\n",
      "Processing batch: 767/973\n",
      "Processing batch: 768/973\n",
      "Processing batch: 769/973\n",
      "Processing batch: 770/973\n",
      "Processing batch: 771/973\n",
      "Processing batch: 772/973\n",
      "Processing batch: 773/973\n",
      "Processing batch: 774/973\n",
      "Processing batch: 775/973\n",
      "Processing batch: 776/973\n",
      "Processing batch: 777/973\n",
      "Processing batch: 778/973\n",
      "Processing batch: 779/973\n",
      "Processing batch: 780/973\n",
      "Processing batch: 781/973\n",
      "Processing batch: 782/973\n",
      "Processing batch: 783/973\n",
      "Processing batch: 784/973\n",
      "Processing batch: 785/973\n",
      "Processing batch: 786/973\n",
      "Processing batch: 787/973\n",
      "Processing batch: 788/973\n",
      "Processing batch: 789/973\n",
      "Processing batch: 790/973\n",
      "Processing batch: 791/973\n",
      "Processing batch: 792/973\n",
      "Processing batch: 793/973\n",
      "Processing batch: 794/973\n",
      "Processing batch: 795/973\n",
      "Processing batch: 796/973\n",
      "Processing batch: 797/973\n",
      "Processing batch: 798/973\n",
      "Processing batch: 799/973\n",
      "Processing batch: 800/973\n",
      "Processing batch: 801/973\n",
      "Processing batch: 802/973\n",
      "Processing batch: 803/973\n",
      "Processing batch: 804/973\n",
      "Processing batch: 805/973\n",
      "Processing batch: 806/973\n",
      "Processing batch: 807/973\n",
      "Processing batch: 808/973\n",
      "Processing batch: 809/973\n",
      "Processing batch: 810/973\n",
      "Processing batch: 811/973\n",
      "Processing batch: 812/973\n",
      "Processing batch: 813/973\n",
      "Processing batch: 814/973\n",
      "Processing batch: 815/973\n",
      "Processing batch: 816/973\n",
      "Processing batch: 817/973\n",
      "Processing batch: 818/973\n",
      "Processing batch: 819/973\n",
      "Processing batch: 820/973\n",
      "Processing batch: 821/973\n",
      "Processing batch: 822/973\n",
      "Processing batch: 823/973\n",
      "Processing batch: 824/973\n",
      "Processing batch: 825/973\n",
      "Processing batch: 826/973\n",
      "Processing batch: 827/973\n",
      "Processing batch: 828/973\n",
      "Processing batch: 829/973\n",
      "Processing batch: 830/973\n",
      "Processing batch: 831/973\n",
      "Processing batch: 832/973\n",
      "Processing batch: 833/973\n",
      "Processing batch: 834/973\n",
      "Processing batch: 835/973\n",
      "Processing batch: 836/973\n",
      "Processing batch: 837/973\n",
      "Processing batch: 838/973\n",
      "Processing batch: 839/973\n",
      "Processing batch: 840/973\n",
      "Processing batch: 841/973\n",
      "Processing batch: 842/973\n",
      "Processing batch: 843/973\n",
      "Processing batch: 844/973\n",
      "Processing batch: 845/973\n",
      "Processing batch: 846/973\n",
      "Processing batch: 847/973\n",
      "Processing batch: 848/973\n",
      "Processing batch: 849/973\n",
      "Processing batch: 850/973\n",
      "Processing batch: 851/973\n",
      "Processing batch: 852/973\n",
      "Processing batch: 853/973\n",
      "Processing batch: 854/973\n",
      "Processing batch: 855/973\n",
      "Processing batch: 856/973\n",
      "Processing batch: 857/973\n",
      "Processing batch: 858/973\n",
      "Processing batch: 859/973\n",
      "Processing batch: 860/973\n",
      "Processing batch: 861/973\n",
      "Processing batch: 862/973\n",
      "Processing batch: 863/973\n",
      "Processing batch: 864/973\n",
      "Processing batch: 865/973\n",
      "Processing batch: 866/973\n",
      "Processing batch: 867/973\n",
      "Processing batch: 868/973\n",
      "Processing batch: 869/973\n",
      "Processing batch: 870/973\n",
      "Processing batch: 871/973\n",
      "Processing batch: 872/973\n",
      "Processing batch: 873/973\n",
      "Processing batch: 874/973\n",
      "Processing batch: 875/973\n",
      "Processing batch: 876/973\n",
      "Processing batch: 877/973\n",
      "Processing batch: 878/973\n",
      "Processing batch: 879/973\n",
      "Processing batch: 880/973\n",
      "Processing batch: 881/973\n",
      "Processing batch: 882/973\n",
      "Processing batch: 883/973\n",
      "Processing batch: 884/973\n",
      "Processing batch: 885/973\n",
      "Processing batch: 886/973\n",
      "Processing batch: 887/973\n",
      "Processing batch: 888/973\n",
      "Processing batch: 889/973\n",
      "Processing batch: 890/973\n",
      "Processing batch: 891/973\n",
      "Processing batch: 892/973\n",
      "Processing batch: 893/973\n",
      "Processing batch: 894/973\n",
      "Processing batch: 895/973\n",
      "Processing batch: 896/973\n",
      "Processing batch: 897/973\n",
      "Processing batch: 898/973\n",
      "Processing batch: 899/973\n",
      "Processing batch: 900/973\n",
      "Processing batch: 901/973\n",
      "Processing batch: 902/973\n",
      "Processing batch: 903/973\n",
      "Processing batch: 904/973\n",
      "Processing batch: 905/973\n",
      "Processing batch: 906/973\n",
      "Processing batch: 907/973\n",
      "Processing batch: 908/973\n",
      "Processing batch: 909/973\n",
      "Processing batch: 910/973\n",
      "Processing batch: 911/973\n",
      "Processing batch: 912/973\n",
      "Processing batch: 913/973\n",
      "Processing batch: 914/973\n",
      "Processing batch: 915/973\n",
      "Processing batch: 916/973\n",
      "Processing batch: 917/973\n",
      "Processing batch: 918/973\n",
      "Processing batch: 919/973\n",
      "Processing batch: 920/973\n",
      "Processing batch: 921/973\n",
      "Processing batch: 922/973\n",
      "Processing batch: 923/973\n",
      "Processing batch: 924/973\n",
      "Processing batch: 925/973\n",
      "Processing batch: 926/973\n",
      "Processing batch: 927/973\n",
      "Processing batch: 928/973\n",
      "Processing batch: 929/973\n",
      "Processing batch: 930/973\n",
      "Processing batch: 931/973\n",
      "Processing batch: 932/973\n",
      "Processing batch: 933/973\n",
      "Processing batch: 934/973\n",
      "Processing batch: 935/973\n",
      "Processing batch: 936/973\n",
      "Processing batch: 937/973\n",
      "Processing batch: 938/973\n",
      "Processing batch: 939/973\n",
      "Processing batch: 940/973\n",
      "Processing batch: 941/973\n",
      "Processing batch: 942/973\n",
      "Processing batch: 943/973\n",
      "Processing batch: 944/973\n",
      "Processing batch: 945/973\n",
      "Processing batch: 946/973\n",
      "Processing batch: 947/973\n",
      "Processing batch: 948/973\n",
      "Processing batch: 949/973\n",
      "Processing batch: 950/973\n",
      "Processing batch: 951/973\n",
      "Processing batch: 952/973\n",
      "Processing batch: 953/973\n",
      "Processing batch: 954/973\n",
      "Processing batch: 955/973\n",
      "Processing batch: 956/973\n",
      "Processing batch: 957/973\n",
      "Processing batch: 958/973\n",
      "Processing batch: 959/973\n",
      "Processing batch: 960/973\n",
      "Processing batch: 961/973\n",
      "Processing batch: 962/973\n",
      "Processing batch: 963/973\n",
      "Processing batch: 964/973\n",
      "Processing batch: 965/973\n",
      "Processing batch: 966/973\n",
      "Processing batch: 967/973\n",
      "Processing batch: 968/973\n",
      "Processing batch: 969/973\n",
      "Processing batch: 970/973\n",
      "Processing batch: 971/973\n",
      "Processing batch: 972/973\n",
      "Processing batch: 973/973\n",
      "                                                     text  \\\n",
      "0       However it is not simply those who are traditi...   \n",
      "1       That is to say, it’s a major departure from th...   \n",
      "2       Our students are everything.\\nAt Mayfield Juni...   \n",
      "3       Slapstick Comedy Returns!\\n2’30 X 52 | 6 to 12...   \n",
      "4       A game by Ismael Rodriguez for PC, originally ...   \n",
      "...                                                   ...   \n",
      "973986  1 GB RAM\\n20 GB Storage\\n2 GB RAM\\n40 GB Stora...   \n",
      "973987  Shopping for a hot tub can be fun. But before ...   \n",
      "973988  Houston, TX (Sports Network) – Jeff Keppinger ...   \n",
      "973989  His Holiness the Dalai Lama will release a new...   \n",
      "973990  So after I posted last night I made the mistak...   \n",
      "\n",
      "                                                     id             dump  \\\n",
      "0       <urn:uuid:8cd4575d-befe-45c5-9dea-9d76a6b1a93c>  CC-MAIN-2024-10   \n",
      "1       <urn:uuid:4dd21ed3-d616-4748-8c7b-b111db5ec1f5>  CC-MAIN-2024-10   \n",
      "2       <urn:uuid:f66f9cbd-dff6-4625-aae8-28402b8c176f>  CC-MAIN-2024-10   \n",
      "3       <urn:uuid:34357d20-1edd-433a-b334-408893e6eb81>  CC-MAIN-2024-10   \n",
      "4       <urn:uuid:74c46249-3a4e-43ec-9be4-579b7ef7ded8>  CC-MAIN-2024-10   \n",
      "...                                                 ...              ...   \n",
      "973986  <urn:uuid:675631f8-838d-46df-a8bc-14a8b846b44b>  CC-MAIN-2024-10   \n",
      "973987  <urn:uuid:a27f6b6a-a57f-477d-bb35-aa7619be8450>  CC-MAIN-2024-10   \n",
      "973988  <urn:uuid:50298437-f07d-494e-915d-d801ecfb7d02>  CC-MAIN-2024-10   \n",
      "973989  <urn:uuid:f0776643-dd9b-4af2-be44-f571e490c187>  CC-MAIN-2024-10   \n",
      "973990  <urn:uuid:81583e3d-4029-4ad6-bc7e-6eb5132471c4>  CC-MAIN-2024-10   \n",
      "\n",
      "                                                      url  \\\n",
      "0                         http://199cr.com/archives/25628   \n",
      "1              http://20literlife.com/manila-philippines/   \n",
      "2                                 http://4.gailroddy.com/   \n",
      "3                               http://5bricksstudio.com/   \n",
      "4       http://8bithorse.blogspot.com/2021/12/below-oc...   \n",
      "...                                                   ...   \n",
      "973986  https://hosting.steeledesignstudio.com/hosting...   \n",
      "973987  https://hottubownerhq.com/does-a-hot-tub-have-...   \n",
      "973988  https://houston.sbnation.com/2010/9/13/1687441...   \n",
      "973989  https://houstonseagle.com/1035689/dalai-lama-p...   \n",
      "973990  https://hoyosrevenge.com/2012/11/11/utah-footb...   \n",
      "\n",
      "                        date  \\\n",
      "0       2024-02-20T21:54:18Z   \n",
      "1       2024-02-20T22:48:53Z   \n",
      "2       2024-02-20T22:54:41Z   \n",
      "3       2024-02-20T23:14:25Z   \n",
      "4       2024-02-20T21:23:36Z   \n",
      "...                      ...   \n",
      "973986  2024-03-02T20:01:58Z   \n",
      "973987  2024-03-02T21:13:08Z   \n",
      "973988  2024-03-02T20:23:52Z   \n",
      "973989  2024-03-02T19:14:56Z   \n",
      "973990  2024-03-02T19:33:53Z   \n",
      "\n",
      "                                                file_path language  \\\n",
      "0       s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "1       s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "2       s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "3       s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "4       s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "...                                                   ...      ...   \n",
      "973986  s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "973987  s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "973988  s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "973989  s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "973990  s3://commoncrawl/crawl-data/CC-MAIN-2024-10/se...       en   \n",
      "\n",
      "        language_score  token_count  is_in_GneissWeb  \n",
      "0             0.963783         1001                1  \n",
      "1             0.980274          635                0  \n",
      "2             0.944642          114                1  \n",
      "3             0.953095          174                1  \n",
      "4             0.959346         1757                0  \n",
      "...                ...          ...              ...  \n",
      "973986        0.897578          429                0  \n",
      "973987        0.948365         3007                0  \n",
      "973988        0.965555          478                0  \n",
      "973989        0.969655          248                0  \n",
      "973990        0.960338          420                0  \n",
      "\n",
      "[973991 rows x 10 columns]\n",
      "It took 0.6 mins to process 973991 documents\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['output/metadata.json', 'output/test1.parquet']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time0 = time.time()\n",
    "table_list, metadata = transform.transform(table)\n",
    "time1 = time.time()\n",
    "print(f\"It took {(time1-time0)/float(60):.1f} mins to process {len(table)} documents\")\n",
    "\n",
    "import glob\n",
    "glob.glob(\"output/*\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "640d2c2f-fec2-4886-a025-5de7254ef883",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
